Regression on handcounted images

linear_threshold <- 46

# Load data for handcounts
HC <- suppressWarnings(
  read_excel("../../Data/Processed/hd_hand_counted.xlsx") %>% 
  dplyr::select(camera_id, handcount)
)

# Load list of bad images that were handcounted
bad_images <- read_excel("../../Data/Processed/bad_images.xlsx") %>% 
  filter(status == "bad") %>% 
  rename(camera_id = cameraid)

# Drop bad images
HC <- HC %>%
  filter(!(camera_id %in% bad_images$camera_id))

linear_area <- read_csv("../../Data/Processed/area_summation_HC_linear_fine.csv",
                       col_types = "cii") %>% 
  filter(lower_thresh == linear_threshold) %>% 
  dplyr::select(-lower_thresh) %>% 
  rename(area_linear = area)

# Join handcounts with areas
HC <- inner_join(HC, linear_area) %>% 
  as.data.frame()
## Joining, by = "camera_id"

Check these areas: should be in the test_case batch but aren’t.

HC %>% 
  filter(is.na(area_linear))
## [1] camera_id   handcount   area_linear
## <0 rows> (or 0-length row.names)

Examine possibly bad images

plot_jpeg = function(path, add=FALSE) {
  # https://stackoverflow.com/a/28729601/168137
  require('jpeg')
  jpg = readJPEG(path, native = TRUE) # read the file
  res = dim(jpg)[2:1] # get the resolution, [x, y]
  if (!add) # initialize an empty plot area if add==FALSE
    plot(1, 1,
         xlim = c(1, res[1]), ylim = c(1, res[2]),
         asp = 1, type = 'n', xaxs = 'i',
         yaxs = 'i', xaxt = 'n', yaxt = 'n',
         xlab = '', ylab = '', bty = 'n', main = path)
  rasterImage(jpg, 1, 1, res[1], res[2])
}

flagged_images <- HC %>% 
  filter(area_linear > 2.5e05)

basedir <- "../../../h2_fec_images/h2_thresh_images/"

if (nrow(flagged_images) >= 1) {
  for (ii in 1:nrow(flagged_images)) {
    print(plot_jpeg(paste0(basedir, flagged_images$camera_id[ii])))
  }
}

Prediction for all images

Linear model predicting egg count from area:

fm_lm <- lm(handcount ~ area_linear - 1, data = HC)
new_data <- tibble(
  area_linear = seq(0, 1.25 * max(c(HC$area_linear)),
                    length.out = 200))

new_data$lm_pred <- predict(fm_lm, newdata = new_data)

ggplot() +
  geom_line(data = new_data, aes(x = area_linear, y = lm_pred)) +
  geom_point(data = HC, aes(x = area_linear, y = handcount))

Evaluation of linear model

HC$lm_pred <- predict(fm_lm)

cor(HC$handcount, HC$lm_pred)
## [1] 0.8127161

Predict for new data

suppressWarnings(
  M <- read_excel("../../Data/Processed/feclife_with-image-ids.xlsx")
)

# Load areas
h2_fec_areas <- read_csv("../../Data/Processed/area_summation_linear_h2_fecimages.csv",
                         col_types = "cii")

M <- left_join(M, h2_fec_areas) %>% 
  rename(area_linear = area) %>% 
    drop_na(area_linear)
## Joining, by = "camera_id"
M$predicted_count_linear <- predict(fm_lm, newdata = M)

Checks

Check these. They are handcounted but in the set of analyzed images.

M %>% 
  filter(is.na(test_case)) %>% 
  filter(!is.na(handcount)) %>% 
  filter(handcount != "unassessable") %>% 
  filter(handcount != "unassigned") %>% 
  mutate(handcount = as.numeric(handcount)) %>% 
  select(camera_id, handcount, predicted_count_linear) %>% 
  filter(!is.na(predicted_count_linear))
## # A tibble: 8 x 3
##   camera_id    handcount predicted_count_linear
##   <chr>            <dbl>                  <dbl>
## 1 IMG_3720.JPG    956                    1325  
## 2 IMG_3780.JPG    900                    1456  
## 3 IMG_5041.JPG      3.00                   23.3
## 4 IMG_5701.JPG      8.00                   36.6
## 5 IMG_5706.JPG      0                      39.3
## 6 IMG_5707.JPG      0                      88.2
## 7 IMG_5730.JPG      7.00                  120  
## 8 IMG_5731.JPG     26.0                   108

Some exploration

ggplot(M, aes(predicted_count_linear)) +
  geom_histogram(bins = 30)

Check high and low predicted eggs

M %>% 
  select(camera_id, predicted_count_linear) %>% 
  arrange(desc(predicted_count_linear))
## # A tibble: 3,437 x 2
##    camera_id    predicted_count_linear
##    <chr>                         <dbl>
##  1 IMG_3756.JPG                   3265
##  2 IMG_2841.JPG                   3013
##  3 IMG_2198.JPG                   2933
##  4 IMG_2807.JPG                   2915
##  5 IMG_2811.JPG                   2756
##  6 IMG_2791.JPG                   2755
##  7 IMG_2813.JPG                   2748
##  8 IMG_2781.JPG                   2723
##  9 IMG_1880.JPG                   2714
## 10 IMG_2810.JPG                   2671
## # ... with 3,427 more rows
M %>% 
  select(camera_id, predicted_count_linear) %>% 
  arrange(predicted_count_linear)
## # A tibble: 3,437 x 2
##    camera_id    predicted_count_linear
##    <chr>                         <dbl>
##  1 IMG_2898.JPG                  0.129
##  2 IMG_4209.JPG                  0.198
##  3 IMG_5615.JPG                  0.555
##  4 IMG_5379.JPG                  0.631
##  5 IMG_4198.JPG                  0.761
##  6 IMG_5610.JPG                  0.806
##  7 IMG_5388.JPG                  1.10 
##  8 IMG_5397.JPG                  1.10 
##  9 IMG_5362.JPG                  1.21 
## 10 IMG_5519.JPG                  1.41 
## # ... with 3,427 more rows
M.low <- M %>% 
 select(camera_id, predicted_count_linear) %>% 
 arrange(predicted_count_linear)

M.low <- M.low[1:300,]

ee <- read_excel("../../Data/Processed/egg_notes_EGK.xlsx")
ee$Image <- paste("IMG_", ee$Image, ".JPG", sep = "")
ww <- which(M.low$camera_id %in% ee$Image)

M.low <- M.low[-ww,]

set.seed <- 8462
set.zeros <- M.low[sample(seq(1, nrow(M.low)), 10) , ]

Examine possibly bad images

plot_jpeg = function(path, add=FALSE) {
  # https://stackoverflow.com/a/28729601/168137
  require('jpeg')
  jpg = readJPEG(path, native = TRUE) # read the file
  res = dim(jpg)[2:1] # get the resolution, [x, y]
  if (!add) # initialize an empty plot area if add==FALSE
    plot(1, 1,
         xlim = c(1, res[1]), ylim = c(1, res[2]),
         asp = 1, type = 'n', xaxs = 'i',
         yaxs = 'i', xaxt = 'n', yaxt = 'n',
         xlab = '', ylab = '', bty = 'n', main = path)
  rasterImage(jpg, 1, 1, res[1], res[2])
}

flagged_images <- M %>% 
  filter(predicted_count_linear > 2000)

basedir <- "../../../h2_fec_images/h2_thresh_images/"

if (nrow(flagged_images) >= 1) {
  for (ii in 1:nrow(flagged_images)) {
    print(plot_jpeg(paste0(basedir, flagged_images$camera_id[ii])))
  }
}
## Loading required package: jpeg

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

## NULL

Save output for further filtering & analysis

save(M, file= "../../Data/Processed/predicted_egg_counts.rda")